[X86] combineCONCAT_VECTORS - fold concat(extract_subvector(X,0),extract_subvector(Y,0)) --> shuffle(X,Y) #135985
Conversation
[X86] combineCONCAT_VECTORS - fold concat(extract_subvector(X,0),extract_subvector(Y,0)) --> shuffle(X,Y)

This is a simplified duplicate of combineConcatVectorOfExtracts, which currently only runs generically before vector legalization. I'm open to alternatives on how to handle this: we could go with this patch; we could add a TLI hook to allow combineConcatVectorOfExtracts to run after legalization; or we could move combineConcatVectorOfExtracts into TargetLowering so we can call it from the x86 combineCONCAT_VECTORS when we want. Any thoughts?
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: as in the description above.

Patch is 50.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135985.diff

13 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3adeb4628eabf..26bf076271f8e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58911,6 +58911,28 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
return R;
}
+ // Fold concat(extract_subvector(X,0),extract_subvector(Y,0))
+ // --> shuffle(X,Y)
+ // This is a more limited version of combineConcatVectorOfExtracts for use
+ // after legalization.
+ if (Ops.size() == 2 && TLI.isTypeLegal(VT)) {
+ SDValue Lo = Ops[0], Hi = Ops[1];
+ if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Lo.getOperand(0).getValueSizeInBits() == VT.getSizeInBits() &&
+ Hi.getOperand(0).getValueSizeInBits() == VT.getSizeInBits() &&
+ isNullConstant(Lo.getOperand(1)) && isNullConstant(Hi.getOperand(1))) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfElts = NumElts / 2;
+ SmallVector<int, 8> ConcatMask(NumElts);
+ std::iota(ConcatMask.begin(), ConcatMask.begin() + HalfElts, 0);
+ std::iota(ConcatMask.begin() + HalfElts, ConcatMask.end(), NumElts);
+ return DAG.getVectorShuffle(
+ VT, SDLoc(N), DAG.getBitcast(VT, Lo.getOperand(0)),
+ DAG.getBitcast(VT, Hi.getOperand(0)), ConcatMask);
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 0748ca626bcf8..70335f834291d 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -396,9 +396,8 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: subq $-128, %rax
; AVX2-NEXT: jne .LBB7_1
@@ -568,13 +567,12 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB8_1: # %loop
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm2
-; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm3
+; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm2
+; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm3
; AVX2-NEXT: vpmuldq %ymm3, %ymm1, %ymm3
; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: subq $-128, %rax
; AVX2-NEXT: jne .LBB8_1
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 1e56f346030ca..2f0d419132492 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -172,9 +172,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 31ef44bd6b42b..c950ce64e8883 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -334,9 +334,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 590f090c59596..da057dd084b36 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -266,9 +266,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr40891.ll b/llvm/test/CodeGen/X86/pr40891.ll
index 24f644683c286..73327491c4151 100644
--- a/llvm/test/CodeGen/X86/pr40891.ll
+++ b/llvm/test/CodeGen/X86/pr40891.ll
@@ -8,9 +8,8 @@ define <8 x i32> @foo(<8 x i64> %x, <4 x i64> %y) {
; CHECK: # %bb.0:
; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retl
%a = shufflevector <4 x i64> %y, <4 x i64> <i64 12345, i64 67890, i64 13579, i64 24680>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%b = and <8 x i64> %x, %a
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 61e3611dcedc9..73ee28a7fd247 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -3,9 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2OR3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
@@ -2846,60 +2846,22 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: test33:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
-; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: test33:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: test33:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
-; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: test33:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test33:
; AVX512: # %bb.0:
@@ -3070,66 +3032,24 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: test34:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
-; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
-; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: test34:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
-; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: test34:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
-; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6]
-; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: test34:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test34:
; AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 80b4f4614383f..320dce840ea57 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -585,9 +585,8 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind {
;
; AVX2-LABEL: trunc_v8i64_v8i1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: testb %al, %al
diff --git a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
index 85cca4f6f9a57..a47e6de2a8cef 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
@@ -3,9 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-vari...
[truncated]
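For illustration, here is a minimal standalone C++ sketch (not LLVM code; the v8i32 width and element values are hypothetical) of the equivalence the new fold relies on: the mask built with std::iota in the X86ISelLowering.cpp hunk above selects the low half of X followed by the low half of Y, which is exactly concat(extract_subvector(X,0), extract_subvector(Y,0)).

```cpp
// Scalar emulation of the fold for a hypothetical v8i32: shuffle(X, Y, Mask)
// with Mask = {0,1,2,3,8,9,10,11} equals concat(lo-half(X), lo-half(Y)).
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  const int NumElts = 8, HalfElts = NumElts / 2;
  std::vector<int> X(NumElts), Y(NumElts);
  std::iota(X.begin(), X.end(), 100); // X = {100..107} (arbitrary values)
  std::iota(Y.begin(), Y.end(), 200); // Y = {200..207} (arbitrary values)

  // Mask construction as in the patch: indices 0..HalfElts-1 pick the low
  // half of X; indices NumElts..NumElts+HalfElts-1 pick the low half of Y.
  std::vector<int> ConcatMask(NumElts);
  std::iota(ConcatMask.begin(), ConcatMask.begin() + HalfElts, 0);
  std::iota(ConcatMask.begin() + HalfElts, ConcatMask.end(), NumElts);

  // Vector-shuffle semantics: mask entries < NumElts index into the first
  // operand (X), the rest index into the second operand (Y).
  std::vector<int> Shuffled(NumElts);
  for (int I = 0; I != NumElts; ++I)
    Shuffled[I] = ConcatMask[I] < NumElts ? X[ConcatMask[I]]
                                          : Y[ConcatMask[I] - NumElts];

  // Reference: concat(extract_subvector(X,0), extract_subvector(Y,0)).
  std::vector<int> Concat(X.begin(), X.begin() + HalfElts);
  Concat.insert(Concat.end(), Y.begin(), Y.begin() + HalfElts);
  assert(Shuffled == Concat); // the two forms agree
}
```

In the updated AVX2 test checks above, this single-shuffle form is what lets the old vperm2f128 + vinsertf128 + vshufps sequences collapse to vshufps + vpermpd.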
I checked several tests; it seems it only happens when …
Agreed, I will have another look at whether we can get the existing generic folds to fire for this one.